Application of Machine Learning for Aflatoxin Detection in Corn Kernels Utilizing Open Source Datasets

Dedy Leonardo Nadeak

Acknowledgment

Thank you to Chloe Siegel for allowing the public to access the raw data.

DOI: 10.1016/j.foodcont.2023.109953, https://github.com/ChloeSiegel/kerneldata

Background

  • In this project, I aim to apply several machine learning techniques for binary classification using aflatoxin-contaminated corn kernel datasets. The algorithms I’ll explore are Partial Least Square Discriminant Analysis (PLS-DA), Random Forest (RF), Support Vector Machine (SVM), and Gradient Boosting Machine (GBM).

  • Samples are separated into high (HC) and low (LC) aflatoxin contamination levels based on SNI 01-3929-2006, under which the maximum aflatoxin level for poultry feed in Indonesia is 50 ppb.

  • For this analysis, I focus exclusively on the SC212M x PHW79 corn hybrid. This dataset comprises 247 samples, with 107 classified as HC and 140 as LC.

Importing the dataset and removing the unused data.

#import the dataset (spectral signatures with per-kernel metadata)
aflatoxin_data <- read.csv("kernel_data/spectralsignatures.csv", header = TRUE)

#select the "SC212M x PHW79" samples
#(subset() evaluates Hybrid inside aflatoxin_data, so no need to re-qualify it)
df <- subset(aflatoxin_data, Hybrid == "SC212M x PHW79")

#Create a reference table from column 1 (sample id) and column 3 (AF_level),
#then label each sample: <= 50 ppb is low contamination (LC), otherwise high
#contamination (HC), following the SNI 01-3929-2006 poultry-feed limit.
AF_ref <- df[, c(1, 3)]
AF_ref$contaminant <- factor(ifelse(AF_ref$AF_level <= 50, "LC", "HC"))

#remove the second to fourth columns (metadata, keeping id + spectra)
df <- df[, -c(2:4)]

#remove "stray light", which can introduce unwanted noise: drop the first 50
#and last 50 wavenumbers (column 1 is the sample id, so spectra start at 2)
df <- df[, -c(2:51, (ncol(df) - 49):ncol(df))]

Plot the original data

Preprocessing data

Train and Test Dataset

#Split the samples 80/20 into training and test sets.
#NOTE(review): despite the original note, sample.split() from caTools is a
#random (stratified) split, NOT the Kennard-Stone algorithm. It also
#stratifies on kernel_number, which looks like a sample id rather than the
#HC/LC class — confirm this is intended, since stratifying on an id makes
#the split effectively random with respect to the classes.
library(caTools)
set.seed(123)
spl <- sample.split(df$kernel_number, SplitRatio = 0.8)

#reference aflatoxin labels, split with the logical mask
AF_ref_train <- subset(AF_ref, spl == TRUE)
AF_ref_test <- subset(AF_ref, spl == FALSE)

#SNV-preprocessed spectra (df_snv is built in an earlier chunk); using the
#same mask keeps rows aligned with AF_ref_train / AF_ref_test
df_snv_train <- subset(df_snv, spl == TRUE)
df_snv_test <- subset(df_snv, spl == FALSE)

PLS-DA Determination

library(mdatools)
### First derivative: fit PLS-DA with up to 20 latent variables using
### 10-fold cross-validation, then fix the model at 4 components and
### report the misclassification rates on the held-out test set.
plsda_d1 <- selectCompNum(
  plsda(df_d1_train[, -1], AF_ref_train$contaminant, 20, cv = 10),
  4
)
pred_plsda_d1 <- predict(plsda_d1, df_d1_test[, -1], AF_ref_test$contaminant)
print(pred_plsda_d1$misclassified[, 4])
   HC    LC Total 
 0.28  0.28  0.28 
Setting:

First derivative preprocessing with 4 components

Random Forest Determination

library(caret)
library(randomForest)
#SNV
#Random forest on the SNV-preprocessed spectra: tune mtry over 1..10 with
#10-fold cross-validation, growing 1000 trees with a terminal node size of 1.
rf_snv_train <- cbind(AF_ref_train$contaminant, df_snv_train[, -1])
names(rf_snv_train)[1] <- "contaminant"
tune_grid <- expand.grid(.mtry = 1:10)
control <- trainControl(method = "cv", number = 10)
set.seed(123)
rf_snv_model <- train(
  contaminant ~ .,
  data = rf_snv_train,
  method = "rf",
  tuneGrid = tune_grid,
  trControl = control,
  ntree = 1000,
  nodesize = 1
)
#Confusion matrix on the held-out test spectra
pred_rf_snv <- predict(rf_snv_model, newdata = df_snv_test[, -1])
print(table(pred_rf_snv, AF_ref_test$contaminant))
           
pred_rf_snv HC LC
         HC 12  6
         LC 12 20

Setting:
SNV Preprocessing
Number of tree : 1000
Node Size : 1
Number of variable: 5

Support Vector Machines (SVMs)

library(caret)
##SNV model
#Tuning a radial-kernel SVM on the SNV-preprocessed spectra.
#Use a name that does not shadow caret::trainControl() (the original
#assigned the result to a variable called `trainControl`, masking the
#function for the rest of the session).
svm_control <- trainControl(method = "cv", number = 10)

# Candidate RBF kernel widths (e1071-style gamma values)
gamma_values <- c(0.01, 0.1, 0.5, 1, 5, 10, 100)

# kernlab's rbfdot kernel (used by method = "svmRadial") is
# exp(-sigma * |x - y|^2) — the same parameterization as e1071's gamma —
# so sigma equals gamma directly. The previous 1 / sqrt(2 * gamma)
# conversion was incorrect and distorted the search grid.
sigma_values <- gamma_values

# Cross every cost value with every kernel width
tuneGrid <- expand.grid(C = c(0.5, 1, 10, 100, 1000), sigma = sigma_values)
set.seed(123)
svm_snv_model <- train(contaminant ~ ., data = rf_snv_train,
               method = "svmRadial",
               trControl = svm_control,
               tuneGrid = tuneGrid)
#Confusion matrix on the held-out test spectra
pred_svm_snv <- predict(svm_snv_model, df_snv_test[,-1])
table(pred_svm_snv, AF_ref_test$contaminant)
            
pred_svm_snv HC LC
          HC  0  0
          LC 24 26

Setting:
SNV Preprocessing
Gamma : 0.01
Cost : 0.5

Gradient Boosting Machine (GBM)

library(caret)
library(gbm)
# 10-fold cross-validation for hyperparameter tuning
train_control <- trainControl(method = "cv", number = 10)

# Hyperparameter grid: every combination of tree count, interaction depth,
# learning rate (shrinkage), and minimum observations per terminal node
tune_grid <- expand.grid(
  n.trees = c(100, 200, 300, 500, 1000),
  interaction.depth = c(1, 3, 5),
  shrinkage = c(0.01, 0.1, 0.3),
  n.minobsinnode = c(10, 20)
)

#SNV model
set.seed(123)
gbm_snv_model <- train(contaminant ~ .,
                       data = rf_snv_train,
                       method = "gbm",
                       trControl = train_control,
                       tuneGrid = tune_grid,
                       verbose = FALSE)
# Confusion matrix on the held-out test spectra
pred_gbm_snv <- predict(gbm_snv_model, df_snv_test[,-1])
table(pred_gbm_snv, AF_ref_test$contaminant)
            
pred_gbm_snv HC LC
          HC 12  7
          LC 12 19

Setting:
SNV Preprocessing
Number of Trees : 100
Interaction Depth : 5
Learning Rate : 0.3
Min. Observation in Node: 20

Conclusion